preliminaries

Let’s load some packages

suppressPackageStartupMessages({
library('data.table')
library('dplyr')
library('ggplot2')
library('cowplot')
})

Some functions that we will find helpful:

This function loads eXpress results with all parameters (default, --fr-stranded, --rf-stranded), given a sample and the column in the quantification file.

load_column <- function(sample_name, column_name) {
  results_path <- file.path(base_dir, 'results', sample_name,
    c('paper', 'paper_forward', 'paper_reverse'),
    'express', 'results.xprs')
  res <- lapply(results_path, read.table, header = TRUE, stringsAsFactors = FALSE)
  res <- lapply(res, function(x) x[, c('target_id', column_name)])
  res <- Map(function(df, new_name) {
    data.table::setnames(df, column_name, new_name)
    df
  }, res, paste0(c('default', 'forward', 'reverse'), '_', column_name))

  Reduce(function(x, y) dplyr::inner_join(x, y, by = 'target_id'), res)
}

This function loads a specific type of eXpress results across all samples:

load_all_samples <- function(which_mode, column_name) {
  which_path <- switch(which_mode,
    default = 'paper',
    forward = 'paper_forward',
    reverse = 'paper_reverse')

  sample_names <- c('ENCLB037ZZZ', 'ENCLB038ZZZ', 'ENCLB055ZZZ', 'ENCLB056ZZZ')
  results_path <- file.path(base_dir, 'results', sample_names, which_path,
    'express', 'results.xprs')

  res <- lapply(results_path, read.table, header = TRUE, stringsAsFactors = FALSE)
  res <- lapply(res, function(x) x[, c('target_id', column_name)])
  res <- Map(function(df, new_name) {
    data.table::setnames(df, column_name, new_name)
    df
  }, res, sample_names)

  res <- Reduce(function(x, y) dplyr::inner_join(x, y, by = 'target_id'), res)
  res <- as.data.frame(res)
  rownames(res) <- res$target_id
  res$target_id <- NULL

  res
}

comparison of forward and reverse

looking at the first data set

base_dir <- '..'
current_sample <- 'ENCLB037ZZZ'
current_sample_tpm <- load_column(current_sample, 'tpm')

Let’s look at running the forward and reverse version on the first data set:

with(current_sample_tpm, cor(forward_tpm, reverse_tpm))
## [1] 0.1889952

The plot doesn’t look so great either:

ggplot(current_sample_tpm, aes(log(forward_tpm + 1), log(reverse_tpm + 1))) +
  geom_point(alpha = 0.4)

comparing the first two replicates to each other

Note: Here we will be comparing TPMs. It is not necessarily completely correct since the distribution is probably slightly different, but since they are replicates it should be more or less okay.

First let’s look at how many reads mapped:

reverse_counts <- load_all_samples('reverse', 'est_counts')
forward_counts <- load_all_samples('forward', 'est_counts')

very few reads map if you do not specify the correct strand (1.4-1.8M versus 60-75M reads)

sapply(forward_counts, sum)
## ENCLB037ZZZ ENCLB038ZZZ ENCLB055ZZZ ENCLB056ZZZ 
##     1819642     1591383     1384936     1720570
sapply(reverse_counts, sum)
## ENCLB037ZZZ ENCLB038ZZZ ENCLB055ZZZ ENCLB056ZZZ 
##    64750369    64927498    60580662    74232876

Let’s get the ordering in the original FASTA:

fasta_ordering <- system("grep '>' ../index/gencode.v16.pc_transcripts.fa | sed 's/^>//g'",
  intern = TRUE)
reverse_tpm <- load_all_samples('reverse', 'tpm')
reverse_tpm <- reverse_tpm[fasta_ordering, ]
forward_tpm <- load_all_samples('forward', 'tpm')
forward_tpm <- forward_tpm[fasta_ordering, ]
forward_plot <- ggplot(forward_tpm, aes(log(ENCLB037ZZZ + 1), log(ENCLB038ZZZ + 1))) +
  geom_point(alpha = 0.4) +
  ggtitle('running express with forward strand')

reverse_plot <- ggplot(reverse_tpm, aes(log(ENCLB037ZZZ + 1), log(ENCLB038ZZZ + 1))) +
  geom_point(alpha = 0.4) +
  ggtitle('running express with reverse strand')

Notice the correlation is not so great using the forward strand:

forward_plot

It improves greatly if you use the reverse strand:

reverse_plot

Correlations:

with(forward_tpm, cor(ENCLB037ZZZ, ENCLB038ZZZ))
## [1] 0.9509102
with(forward_tpm, cor(ENCLB037ZZZ, ENCLB038ZZZ, method = 'spearman'))
## [1] 0.7175198
with(reverse_tpm, cor(ENCLB037ZZZ, ENCLB038ZZZ))
## [1] 0.9872897
with(reverse_tpm, cor(ENCLB037ZZZ, ENCLB038ZZZ, method = 'spearman'))
## [1] 0.8958683

Prepare for submission:

It is unclear what the unit is on the website. Let’s figure it out:

tmp <- read.table('http://rafalab.rc.fas.harvard.edu/rnaseqcomp/encodeexample.txt',
  header = TRUE)

It appears to be TPM:

sapply(tmp, sum)
## ENCLB037ZZZ ENCLB038ZZZ ENCLB055ZZZ ENCLB056ZZZ 
##   1000000.7    999999.8   1000000.5   1000000.4

Write the reverse results out for submission to http://rafalab.rc.fas.harvard.edu/rnaseqbenchmark:

write.table(reverse_tpm, file = '../results/reverse_merged.tpm',
  quote = FALSE, sep = '\t', row.names = FALSE, col.names = TRUE)

Session info

sessionInfo()
## R version 3.3.0 (2016-05-03)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 14.04.4 LTS
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] utils   stats   methods base   
## 
## other attached packages:
## [1] cowplot_0.6.2    ggplot2_2.1.0    dplyr_0.4.3      data.table_1.9.6
## [5] knitr_1.13      
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.5      grDevices_3.3.0  digest_0.6.9     assertthat_0.1  
##  [5] plyr_1.8.3       grid_3.3.0       chron_2.3-47     R6_2.1.2        
##  [9] gtable_0.2.0     DBI_0.4-1        magrittr_1.5     scales_0.4.0    
## [13] evaluate_0.9     stringi_1.1.1    graphics_3.3.0   rmarkdown_0.9.6 
## [17] labeling_0.3     tools_3.3.0      stringr_1.0.0    munsell_0.4.3   
## [21] yaml_2.1.13      parallel_3.3.0   colorspace_1.2-6 htmltools_0.3.5